{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the %pip magic (not !pip) so the package lands in the running kernel's environment.\n",
    "%pip install -U firecrawl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from firecrawl import FirecrawlApp\n",
    "from dotenv import load_dotenv\n",
    "import pandas as pd\n",
    "from typing import Dict, Any\n",
    "from pydantic import BaseModel\n",
    "import time\n",
    "\n",
    "class WebsiteScraper:\n",
    "    \"\"\"Run prompt-driven extraction through a Firecrawl client, with an optional field schema.\n",
    "\n",
    "    The API key is read from the FIRECRAWL_API_KEY environment variable after\n",
    "    ``load_dotenv()`` has been called.\n",
    "    \"\"\"\n",
    "\n",
    "    # Type names accepted in a field spec, mapped to the Python type used as the annotation.\n",
    "    _TYPE_MAPPING = {\n",
    "        \"str\": str,\n",
    "        \"bool\": bool,\n",
    "        \"int\": int,\n",
    "        \"float\": float,\n",
    "    }\n",
    "\n",
    "    def __init__(self):\n",
    "        load_dotenv()\n",
    "        self.firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
    "        self.app = FirecrawlApp(api_key=self.firecrawl_api_key)\n",
    "        # Placeholder spec: its only field has an empty name, so it yields no schema.\n",
    "        self.schema_fields = [{\"name\": \"\", \"type\": \"str\"}]\n",
    "\n",
    "    def create_dynamic_model(self, fields):\n",
    "        \"\"\"Create a dynamic Pydantic model from schema fields.\n",
    "\n",
    "        Args:\n",
    "            fields: Iterable of dicts with a \"name\" and a \"type\" key. Entries whose\n",
    "                name is empty are skipped. \"type\" must be one of: str, bool, int, float.\n",
    "\n",
    "        Returns:\n",
    "            A BaseModel subclass named \"ExtractSchema\" with one annotated field\n",
    "            per named entry.\n",
    "\n",
    "        Raises:\n",
    "            ValueError: If a named field has a missing or unsupported type.\n",
    "        \"\"\"\n",
    "        field_annotations = {}\n",
    "        for field in fields:\n",
    "            name = field[\"name\"]\n",
    "            if not name:\n",
    "                continue\n",
    "            type_name = field.get(\"type\")\n",
    "            if type_name not in self._TYPE_MAPPING:\n",
    "                supported = \", \".join(self._TYPE_MAPPING)\n",
    "                raise ValueError(\n",
    "                    f\"Unsupported type {type_name!r} for field {name!r}; \"\n",
    "                    f\"expected one of: {supported}\"\n",
    "                )\n",
    "            field_annotations[name] = self._TYPE_MAPPING[type_name]\n",
    "\n",
    "        return type(\n",
    "            \"ExtractSchema\",\n",
    "            (BaseModel,),\n",
    "            {\n",
    "                \"__annotations__\": field_annotations\n",
    "            }\n",
    "        )\n",
    "\n",
    "    def create_schema_from_fields(self, fields):\n",
    "        \"\"\"Create schema using Pydantic model.\n",
    "\n",
    "        Returns the model's JSON schema, or None when no field has a name.\n",
    "        \"\"\"\n",
    "        if not any(field[\"name\"] for field in fields):\n",
    "            return None\n",
    "\n",
    "        model_class = self.create_dynamic_model(fields)\n",
    "        return model_class.model_json_schema()\n",
    "\n",
    "    def convert_to_table(self, data: Dict[str, Any]) -> str:\n",
    "        \"\"\"Convert data to a pandas DataFrame and return as string.\n",
    "\n",
    "        A list (or tuple) under the 'data' key is rendered as one row per element;\n",
    "        any other payload, such as a single dict, becomes a one-row table.\n",
    "        Returns \"\" when `data` is empty, has no 'data' key, or the payload is empty.\n",
    "        \"\"\"\n",
    "        if not data or 'data' not in data:\n",
    "            return \"\"\n",
    "\n",
    "        payload = data['data']\n",
    "        if not payload:\n",
    "            return \"\"\n",
    "\n",
    "        # Wrapping a list of records in another list would squeeze them all into one row.\n",
    "        rows = list(payload) if isinstance(payload, (list, tuple)) else [payload]\n",
    "        df = pd.DataFrame(rows)\n",
    "        return df.to_string(index=False)\n",
    "\n",
    "    def scrape_website(self, website_url: str, prompt: str, schema_fields=None):\n",
    "        \"\"\"Main function to scrape website data.\n",
    "\n",
    "        Args:\n",
    "            website_url: URL passed, as a one-element list, to the client's extract call.\n",
    "            prompt: Extraction instruction sent under the 'prompt' key.\n",
    "            schema_fields: Optional field specs (see create_dynamic_model). When empty\n",
    "                or None, or when no field is named, no 'schema' key is sent.\n",
    "\n",
    "        Returns:\n",
    "            Whatever the client's extract call returns.\n",
    "\n",
    "        Raises:\n",
    "            ValueError: If `website_url` is empty.\n",
    "            Exception: For any failure while building the schema or extracting;\n",
    "                the original error is chained as the cause.\n",
    "        \"\"\"\n",
    "        if not website_url:\n",
    "            raise ValueError(\"Please provide a website URL\")\n",
    "\n",
    "        try:\n",
    "            schema = self.create_schema_from_fields(schema_fields) if schema_fields else None\n",
    "\n",
    "            extract_params = {'prompt': prompt}\n",
    "            if schema:\n",
    "                extract_params['schema'] = schema\n",
    "\n",
    "            return self.app.extract([website_url], extract_params)\n",
    "\n",
    "        except Exception as e:\n",
    "            # Chain explicitly so the underlying error stays attached as __cause__.\n",
    "            raise Exception(f\"An error occurred: {str(e)}\") from e"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scraper = WebsiteScraper()\n",
    "\n",
    "# Target site and extraction instruction (hard-coded for this run)\n",
    "website_url = \"https://blog.dailydoseofds.com/*\"\n",
    "prompt = \"extract publish date, title and link of all articles related to LLMs\"\n",
    "\n",
    "# Optional: Add schema fields\n",
    "schema_fields = [\n",
    "    {\"name\": \"Article_title\", \"type\": \"str\"},\n",
    "    {\"name\": \"Publish_date\", \"type\": \"str\"},\n",
    "    {\"name\": \"Article_link\", \"type\": \"str\"}\n",
    "]\n",
    "\n",
    "# Get results. Pass the fields defined above: an empty list makes\n",
    "# scrape_website send the prompt alone, with no schema.\n",
    "result = scraper.scrape_website(website_url, prompt, schema_fields)\n",
    "print(\"Results:\\n\")\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the 'data' entry of the response returned by scrape_website.\n",
    "result['data']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand-written pydantic model: one string field and three boolean fields.\n",
    "# NOTE: a later cell defines another class with this same name; whichever\n",
    "# of the two cells ran most recently determines what `ExtractSchema` refers to.\n",
    "class ExtractSchema(BaseModel):\n",
    "    mission: str\n",
    "    supports_sso: bool\n",
    "    is_open_source: bool\n",
    "    is_in_yc: bool"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the JSON schema generated for the currently defined ExtractSchema model.\n",
    "ExtractSchema.model_json_schema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the JSON schema the scraper builds from `schema_fields`\n",
    "# (the call returns None when no field has a name).\n",
    "scraper.create_schema_from_fields(schema_fields)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from firecrawl import FirecrawlApp\n",
    "from pydantic import BaseModel, Field\n",
    "\n",
    "# Build a Firecrawl client from the API key found in the environment\n",
    "app = FirecrawlApp(api_key=os.environ.get(\"FIRECRAWL_API_KEY\"))\n",
    "\n",
    "# Record shape requested from the extraction: three string fields\n",
    "class ExtractSchema(BaseModel):\n",
    "    article_title: str\n",
    "    publish_date: str\n",
    "    article_link: str\n",
    "\n",
    "# Single extract call: a one-element URL list plus the prompt and the model's JSON schema\n",
    "data = app.extract(\n",
    "    [\"https://blog.dailydoseofds.com/*\"],\n",
    "    dict(\n",
    "        prompt='Extract the article title, publish date, and article link of all articles related to LLMs.',\n",
    "        schema=ExtractSchema.model_json_schema(),\n",
    "    ),\n",
    ")\n",
    "print(data)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
